import pandas as pd
import zipfile
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import seaborn as sns
from scipy.stats import boxcox
import matplotlib.pyplot as plt
%%capture
!pip install catboost logzero
import pandas as pd
import numpy as np
import os
import random
import requests, zipfile
from lightgbm import LGBMClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import hamming_loss
from logzero import logger
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
zip_file_name = "playground-series-s3e26.zip"
with zipfile.ZipFile(zip_file_name, 'r') as file:
    file.extractall()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')
import gc
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    df[col] = df[col].astype(np.uint64)
            elif str(col_type)[:5] == 'float':
                # Note: float16 keeps only ~3 significant decimal digits; drop this branch
                # if exact float precision matters for downstream features.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    gc.collect()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
# Set the seed value all over the place to make this reproducible.
def seed_all(SEED=42):
    random.seed(SEED)
    np.random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
seed_all()
class DataProcessing:
    flags = ['ip.dsfield.ecn', 'ip.flags.df']
    list_cols = ['ip.len', 'ip.ttl', 'tcp.offset', 'tcp.options.timestamp.tsval', 'tcp.window_size_value',
                 'tls.handshake.extensions_length', 'frame.time_epoch']
    tcp_flags = ['CWR', 'ECE', 'SYN', 'ACK', 'PSH']
    cert_curve_uniques = ['prime256v1', 'secp384r1']

    def __init__(self, train, test):
        self.train = train
        self.test = test

    def process_list_cols(self, data):
        for col in self.list_cols:
            # cleaning
            data[col] = data[col].apply(lambda x: x.strip('[').strip(']').split(', '))
            data[col] = data[col].apply(lambda x: ["'0'" if y == "''" else y for y in x])
            data[col] = data[col].apply(lambda x: [float(y.strip("'")) for y in x])
            # statistics features
            data[f'{col}_max'] = data[f'{col}'].apply(lambda x: np.max(x))
            data[f'{col}_min'] = data[f'{col}'].apply(lambda x: np.min(x))
            data[f'{col}_len'] = data[f'{col}'].apply(lambda x: len(x))
            data[f'{col}_mean'] = data[f'{col}'].apply(lambda x: np.mean(x))
            data[f'{col}_median'] = data[f'{col}'].apply(lambda x: np.median(x))
            data[f'{col}_std'] = data[f'{col}'].apply(lambda x: np.std(x))
        return data
    def process_flags_cols(self, data):
        for c in self.flags:
            if c == 'ip.dsfield.ecn':
                data[f'{c}'] = data[f'{c}'].apply(lambda x: x.strip('[').strip(']').split(', '))
                data[f'{c}'] = data[f'{c}'].apply(lambda x: [int(y.strip("'")) for y in x])
                data[f'{c}_0_count'] = data[f'{c}'].apply(lambda x: x.count(0))
                data[f'{c}_2_count'] = data[f'{c}'].apply(lambda x: x.count(2))
                data[f'{c}_mean'] = data[f'{c}'].apply(lambda x: np.mean(x))
                data[f'{c}_std'] = data[f'{c}'].apply(lambda x: np.std(x))
            else:
                data[f'{c}'] = data[f'{c}'].apply(lambda x: x.strip('[').strip(']').split(', '))
                data[f'{c}'] = data[f'{c}'].apply(lambda x: [int(y.strip("'")) for y in x])
                data[f'{c}_0_count'] = data[f'{c}'].apply(lambda x: x.count(0))
                data[f'{c}_1_count'] = data[f'{c}'].apply(lambda x: x.count(1))
                data[f'{c}_mean'] = data[f'{c}'].apply(lambda x: np.mean(x))
                data[f'{c}_std'] = data[f'{c}'].apply(lambda x: np.std(x))
        return data
    def process_tcp_flags(self, data):
        for c in self.tcp_flags:
            data[f'tcp_flag_{c}'] = (data['tcp.flags'] == c) * 1
            data[f'tcp_flag_{c}_count'] = data['tcp.flags'].apply(lambda x: x.count(c))
        return data
    def FE(self, data):
        data['tcp.options.timestamp.tsval_diff'] = data['tcp.options.timestamp.tsval_max'] - data['tcp.options.timestamp.tsval_min']
        data['packet_directions'] = data['packet_directions'].apply(lambda x: x.strip('[').strip(']').split(', '))
        data['packet_directions_I_count'] = data['packet_directions'].apply(lambda x: x.count("'I'"))
        data['packet_directions_O_count'] = data['packet_directions'].apply(lambda x: x.count("'O'"))
        data['packet_directions_len'] = data['packet_directions'].apply(lambda x: len(x))
        for c in self.cert_curve_uniques:
            data[c] = data['cert.curve'].fillna('').str.contains(c) * 1
        data['cert.curve_len'] = data['cert.curve'].fillna('').apply(lambda x: len(x))
        # OHE of supported TLS versions
        tls_vers = ['TLS 1.3', 'TLS 1.2', 'Reserved (GREASE)', 'TLS 1.1', 'TLS 1.0']
        for c in tls_vers:
            data[c] = data['tls.handshake.extensions.supported_version.ch'].str.contains(c) * 1
        # OHE of ALPN protocols in the client hello
        tls_ch = ['h2', 'http/1.1']
        for c in tls_ch:
            data[c] = data['tls.handshake.extensions_alpn_str.ch'].str.contains(c) * 1
        # Label enc
        sh_mapper = {'http/1.1': 0, 'h2': 1}
        data['tls.handshake.extensions_alpn_str.sh'] = data['tls.handshake.extensions_alpn_str.sh'].map(sh_mapper)
        # Label enc
        format_ch_mapper = {"['0']": 0, "['0', '1', '2']": 1}
        data['tls.handshake.extensions_ec_point_format.ch'] = data['tls.handshake.extensions_ec_point_format.ch'].map(format_ch_mapper)
        data['tls.handshake.extensions_ec_point_format.sh'] = data['tls.handshake.extensions_ec_point_format.sh'].map(format_ch_mapper)
        # Label enc
        tls_handshake_mapper = {'TLSv1.2': 0, 'TLSv1.3': 1}
        data['tls.handshake.version.sh'] = data['tls.handshake.version.sh'].map(tls_handshake_mapper)
        # OHE of record versions
        for c in tls_vers:
            data[f'tls.record.version.ch_{c}'] = data['tls.record.version.ch'].str.contains(c) * 1
        return data
    def get_tf_idf_feats(self, train, test):
        from sklearn.feature_extraction.text import TfidfVectorizer
        vect = TfidfVectorizer(max_features=10, analyzer='char')
        feats = vect.fit_transform(train['tls.cipher'])
        train = pd.concat([train, pd.DataFrame(feats.todense(), columns=vect.get_feature_names_out())], axis=1)
        test = pd.concat([test, pd.DataFrame(vect.transform(test['tls.cipher']).todense(), columns=vect.get_feature_names_out())], axis=1)
        return train, test
    def process(self):
        self.train = reduce_mem_usage(self.train)
        self.test = reduce_mem_usage(self.test)
        logger.info("Processing Data")
        self.train = self.process_list_cols(self.train)
        self.test = self.process_list_cols(self.test)
        logger.info("Added statistics about tcp, tls and ip logs")
        self.train = self.process_flags_cols(self.train)
        self.test = self.process_flags_cols(self.test)
        logger.info("Added flags features")
        self.train = self.process_tcp_flags(self.train)
        self.test = self.process_tcp_flags(self.test)
        logger.info("Added tcp flags features")
        logger.info("Feature Engineering")
        self.train = self.FE(self.train)
        self.test = self.FE(self.test)
        self.train, self.test = self.get_tf_idf_feats(self.train, self.test)
        logger.info("Added TF-IDF Features")
        self.train = reduce_mem_usage(self.train)
        self.test = reduce_mem_usage(self.test)
        return self.train, self.test
sns.countplot(data = train , x = train.Status)
n,m = train.shape
id = test.id
y = train.Status
num = ["Bilirubin","Cholesterol" , "Albumin","Copper","Alk_Phos","SGOT","Tryglicerides","Platelets","Prothrombin"]
train[num].std()
Bilirubin           3.812960
Cholesterol       195.379344
Albumin             0.346171
Copper             75.899266
Alk_Phos         1903.750657
SGOT               48.790945
Tryglicerides      52.530402
Platelets          87.465579
Prothrombin         0.781735
dtype: float64
train_copy = train[num]
for i in num:
    plt.figure(figsize=(8, 6))  # Adjust the figure size as needed
    sns.boxplot(data=train, x=i)
    plt.title(f'Boxplot for {i}')
    plt.show()
def days_to_years(age_in_days):
    return age_in_days / 365.25
train["Age_y"] = days_to_years(train["Age"]).astype('int')
test["Age_y"] = days_to_years(test["Age"]).astype('int')
status_map = {"D": 0, 'C': 1, 'CL': 2}
# import itertools
# FEATS = ["N_Days",'Bilirubin',"Cholesterol","Albumin","Copper","Alk_Phos","Tryglicerides","SGOT","Platelets","Prothrombin","Age"]
# # Feature engineering
# def feature_engineering(df):
# # Create new columns representing pairwise sums, quotients, and products
# for feat1, feat2 in itertools.combinations(FEATS, 2):
# new_col_name = f'{feat1}_plus_{feat2}'
# df[new_col_name] = df[feat1] + df[feat2]
# new_col_name = f'{feat1}_div_{feat2}'
# df[new_col_name] = df[feat1] / df[feat2]
# new_col_name = f'{feat1}_times_{feat2}'
# df[new_col_name] = df[feat1] * df[feat2]
# return df
# train = feature_engineering(train)
# test = feature_engineering(test)
# print("Feature engineering complete...")
# threshold_platelets = 150
# train['thrombocytopenia'] = np.where(train['Platelets'] < threshold_platelets, 1, 0)
# test['thrombocytopenia'] = np.where(test['Platelets'] < threshold_platelets, 1, 0)
# threshold_alk_phos_upper = 147 # Upper limit of normal range
# threshold_alk_phos_lower = 44 # Lower limit of normal range
# train['elevated_alk_phos'] = np.where((train['Alk_Phos'] > threshold_alk_phos_upper) | (train['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)
# test['elevated_alk_phos'] = np.where((test['Alk_Phos'] > threshold_alk_phos_upper) | (test['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)
# normal_copper_range = (62, 140)
# train['normal_copper'] = np.where((train['Copper'] >= normal_copper_range[0]) & (train['Copper'] <= normal_copper_range[1]), 1, 0)
# test['normal_copper'] = np.where((test['Copper'] >= normal_copper_range[0]) & (test['Copper'] <= normal_copper_range[1]), 1, 0)
# normal_albumin_range = (3.4, 5.4)
# train['normal_albumin'] = np.where((train['Albumin'] >= normal_albumin_range[0]) & (train['Albumin'] <= normal_albumin_range[1]), 1, 0)
# test['normal_albumin'] = np.where((test['Albumin'] >= normal_albumin_range[0]) & (test['Albumin'] <= normal_albumin_range[1]), 1, 0)
# normal_bilirubin_range = (0.2, 1.2)
# train['normal_bilirubin'] = np.where((train['Bilirubin'] >= normal_bilirubin_range[0]) & (train['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)
# test['normal_bilirubin'] = np.where((test['Bilirubin'] >= normal_bilirubin_range[0]) & (test['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)
# train['DiagnosisDays'] = train['Age'] - train['N_Days']
# test['DiagnosisDays'] = test['Age'] - test['N_Days']
# train['Age_Group'] = pd.cut(train['Age_y'], bins=[19, 29, 49, 64, 99], labels = [0, 1, 2, 3]).astype('int16')
# test['Age_Group'] = pd.cut(test['Age_y'], bins=[19, 29, 49, 64, 99], labels = [0, 1, 2, 3]).astype('int16')
# train['Bilirubin_Albumin'] =train['Bilirubin'] *train['Albumin']
# test['Bilirubin_Albumin'] =test['Bilirubin'] *test['Albumin']
# train['Diag_Year'] = (train['N_Days'] / 365).astype(int)
# train['Diag_Month'] = ((train['N_Days'] % 365) / 30).astype(int)
# test['Diag_Year'] = (test['N_Days'] / 365).astype(int)
# test['Diag_Month'] = ((test['N_Days'] % 365) / 30).astype(int)
# train['Risk_Score'] = train['Bilirubin'] + train['Albumin'] - train['Alk_Phos']
# test['Risk_Score'] = test['Bilirubin'] + test['Albumin'] - test['Alk_Phos']
# liver_columns = ['Bilirubin', 'Albumin', 'Alk_Phos', 'SGOT']
# train['Liver_Function_Index'] = train[liver_columns].mean(axis=1)
# test['Liver_Function_Index'] = test[liver_columns].mean(axis=1)
AllData = pd.concat([train,test],axis=0).drop(['id'],axis=1)
numerical_features = [i for i in AllData.columns if train[i].dtype!='object' ]
num_copy = numerical_features.copy()
num_copy.remove('Stage')
num_copy.remove('N_Days')
num_copy.remove('Age')
AllData
| | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 999 | D-penicillamine | 21532 | M | N | N | N | N | 2.30 | 316.00 | 3.35 | 172.00 | 1601.00 | 179.80 | 63.00 | 394.00 | 9.70 | 3.00 | D |
| 1 | 2574 | Placebo | 19237 | F | N | N | N | N | 0.90 | 364.00 | 3.54 | 63.00 | 1440.00 | 134.85 | 88.00 | 361.00 | 11.00 | 3.00 | C |
| 2 | 3428 | Placebo | 13727 | F | N | Y | Y | Y | 3.30 | 299.00 | 3.55 | 131.00 | 1029.00 | 119.35 | 50.00 | 199.00 | 11.70 | 4.00 | D |
| 3 | 2576 | Placebo | 18460 | F | N | N | N | N | 0.60 | 256.00 | 3.50 | 58.00 | 1653.00 | 71.30 | 96.00 | 269.00 | 10.70 | 3.00 | C |
| 4 | 788 | Placebo | 16658 | F | N | Y | N | N | 1.10 | 346.00 | 3.65 | 63.00 | 1181.00 | 125.55 | 96.00 | 298.00 | 10.60 | 4.00 | C |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5266 | 2870 | Placebo | 12279 | F | N | N | N | N | 1.30 | 302.00 | 3.43 | 75.00 | 1345.00 | 145.00 | 44.00 | 181.00 | 10.60 | 3.00 | NaN |
| 5267 | 1770 | Placebo | 24803 | F | N | N | N | N | 0.50 | 219.00 | 4.09 | 121.00 | 663.00 | 79.05 | 94.00 | 311.00 | 9.70 | 3.00 | NaN |
| 5268 | 3707 | D-penicillamine | 16990 | F | N | Y | N | N | 0.80 | 315.00 | 4.09 | 13.00 | 1637.00 | 170.50 | 70.00 | 426.00 | 10.90 | 3.00 | NaN |
| 5269 | 1216 | Placebo | 11773 | F | N | N | N | N | 0.70 | 329.00 | 3.80 | 52.00 | 678.00 | 57.00 | 126.00 | 306.00 | 10.20 | 1.00 | NaN |
| 5270 | 2272 | D-penicillamine | 21600 | F | N | N | N | N | 2.00 | 232.00 | 3.42 | 18.00 | 1636.00 | 170.50 | 83.00 | 213.00 | 13.60 | 2.00 | NaN |
13176 rows × 19 columns
skewed = ['Bilirubin','Cholesterol',"Alk_Phos","Copper","Prothrombin",'SGOT','Tryglicerides','Albumin']
for i in skewed:
    transformed_data, lambda_value = boxcox(AllData[i])
    AllData[i] = transformed_data
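# Note: scipy.stats.boxcox requires strictly positive input, so the loop above assumes
# every column in `skewed` is > 0. A minimal guard (safe_boxcox is a hypothetical helper,
# not part of the original notebook) for columns that might contain zeros:
def safe_boxcox(series):
    shift = 1e-6 - series.min() if series.min() <= 0 else 0.0  # shift so the minimum is slightly positive
    transformed, _ = boxcox(series + shift)
    return transformed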
train = AllData.iloc[:n]
test = AllData.iloc[n:]
def remove_outliers(data, threshold=3):
    mean_value = np.mean(data)
    std_dev = np.std(data)
    # Define the lower and upper bounds for outliers
    lower_bound = mean_value - threshold * std_dev
    upper_bound = mean_value + threshold * std_dev
    # Keep only the data points within the bounds
    data_no_outliers = data[(data >= lower_bound) & (data <= upper_bound)]
    return data_no_outliers
train[num] = train_copy.apply(remove_outliers)
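# Note: remove_outliers returns a shorter, filtered Series per column, so the
# re-assignment above leaves NaN wherever a value fell outside the 3-sigma bounds;
# tree models such as XGBoost/LightGBM handle those NaNs natively.
print(train[num].isna().sum())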
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
encoders = {
'Drug': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, categories=[['Placebo', 'D-penicillamine']]),
'Sex': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
'Ascites': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
'Hepatomegaly': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
'Spiders': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
# 'Edema': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, categories=[['N', 'S', 'Y']]),
'Edema': OneHotEncoder(),
'Stage': OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
}
for feat, enc in encoders.items():
    if isinstance(enc, OrdinalEncoder):
        train[feat] = enc.fit_transform(train[[feat]]).astype('int32')
        test[feat] = enc.transform(test[[feat]]).astype('int32')
    if isinstance(enc, OneHotEncoder):
        # Transform and get new column names
        new_cols = enc.fit_transform(train[[feat]]).toarray().astype('int8')
        # col_names = [f"{feat}_{cat}" for cat in enc.categories_[0]]
        col_names = enc.get_feature_names_out()
        # Add new columns to the dataframe
        train[col_names] = new_cols
        train.drop(feat, axis=1, inplace=True)  # Drop original column
        # Repeat for the test set
        new_cols_test = enc.transform(test[[feat]]).toarray().astype('int8')
        test[col_names] = new_cols_test
        test.drop(feat, axis=1, inplace=True)
map_dict = {"Y" :1 , "N" : 0,"S" : -1}
Y_N_col = ['Ascites',"Hepatomegaly","Spiders","Edema"]
AllData[Y_N_col] = AllData[Y_N_col].apply(lambda x : x.map(map_dict))
train.columns
Index(['N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders',
'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT',
'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage', 'Status', 'Age_y',
'Edema_N', 'Edema_S', 'Edema_Y'],
dtype='object')
train['Disease_count'] = np.abs(train['Ascites']) + np.abs(train['Hepatomegaly']) + np.abs(train['Spiders']) + np.abs(train['Edema_Y'])
test['Disease_count'] = np.abs(test['Ascites']) + np.abs(test['Hepatomegaly']) + np.abs(test['Spiders']) + np.abs(test['Edema_Y'])
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
class DiagnosisDateTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['Diagnosis_Date'] = X['Age'] - X['N_Days']
        return X
class AgeYearsTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['Age_Years'] = round(X['Age'] / 365.25).astype("int16")
        return X
class AgeGroupsTransformer(BaseEstimator, TransformerMixin):
    """Older people might be hit harder (interaction) by health issues. Also can cover lifestyle influences,
    e.g. alcohol consumption."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # Use years from above, min=26, max=78
        X['Age_Group'] = pd.cut(X['Age_Years'], bins=[19, 29, 49, 64, 99], labels=[0, 1, 2, 3]).astype('int16')
        return X
class BilirubinAlbuminTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['Bilirubin_Albumin'] = X['Bilirubin'] * X['Albumin']
        return X
class DrugEffectivenessTransformer(BaseEstimator, TransformerMixin):
    # Placeholder concept, assuming 'Bilirubin' improvement is a measure of effectiveness
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['Drug_Effectiveness'] = X['Drug'] * X['Bilirubin']
        return X
class SymptomScoreTransformer(BaseEstimator, TransformerMixin):
    # From data set explanations above let's add all the "bad" symptoms
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # symptom_columns = ['Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
        symptom_columns = ['Ascites', 'Hepatomegaly', 'Spiders', 'Edema_N', 'Edema_S', 'Edema_Y']
        X['Symptom_Score'] = X[symptom_columns].sum(axis=1)
        return X
class SymptomCatTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.symptom_columns = ['Ascites', 'Hepatomegaly', 'Spiders', 'Edema_N', 'Edema_S', 'Edema_Y']
        self.encoder = OneHotEncoder(handle_unknown='ignore')
    def fit(self, X, y=None):
        X_copy = X.copy()
        symptom_scores = X_copy[self.symptom_columns].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
        self.encoder.fit(symptom_scores.values.reshape(-1, 1))
        return self
    def transform(self, X):
        X_transformed = X.copy()
        symptom_scores = X_transformed[self.symptom_columns].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
        encoded_features = self.encoder.transform(symptom_scores.values.reshape(-1, 1)).toarray().astype("int8")
        encoded_feature_names = self.encoder.get_feature_names_out(input_features=['Symptom_Score'])
        # Drop the original symptom columns and add the new encoded features
        # X_transformed.drop(columns=self.symptom_columns, inplace=True)
        X_transformed[encoded_feature_names] = pd.DataFrame(encoded_features, index=X_transformed.index)
        return X_transformed
class LiverFunctionTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        liver_columns = ['Bilirubin', 'Albumin', 'Alk_Phos', 'SGOT']
        X['Liver_Function_Index'] = X[liver_columns].mean(axis=1)
        return X
class RiskScoreTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['Risk_Score'] = X['Bilirubin'] + X['Albumin'] - X['Alk_Phos']
        return X
class TimeFeaturesTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['Diag_Year'] = (X['N_Days'] / 365).astype(int)
        X['Diag_Month'] = ((X['N_Days'] % 365) / 30).astype(int)
        return X
class ScalingTransformer(BaseEstimator, TransformerMixin):
    # NOTE: NUM_FEATS (a base list of numeric feature names) must be defined before this
    # transformer is used; it is left disabled in the pipeline below.
    def __init__(self):
        self.scaler = StandardScaler()
        self.num_feats = NUM_FEATS + ['Diagnosis_Date', 'Age_Years', 'Bilirubin_Albumin', 'Drug_Effectiveness',
                                      'Symptom_Score', 'Liver_Function_Index', 'Risk_Score', 'Diag_Year', 'Diag_Month']
    def fit(self, X, y=None):
        self.scaler.fit(X[self.num_feats])
        return self
    def transform(self, X):
        X_scaled = X.copy()
        X_scaled[self.num_feats] = self.scaler.transform(X_scaled[self.num_feats])
        return X_scaled
# Define the pipeline
pipeline = Pipeline([
('diagnosis_date', DiagnosisDateTransformer()),
('age_years', AgeYearsTransformer()),
('age_groups', AgeGroupsTransformer()),
('bilirubin_albumin', BilirubinAlbuminTransformer()),
('drug_effectiveness', DrugEffectivenessTransformer()),
('symptom_score', SymptomScoreTransformer()),
('symptom_cat_score', SymptomCatTransformer()),
('liver_function', LiverFunctionTransformer()),
('risk_score', RiskScoreTransformer()),
('time_features', TimeFeaturesTransformer()),
#('scaling', ScalingTransformer()),
# ... ?
])
# Apply the pipeline to your dataframes
train = pipeline.fit_transform(train)
test = pipeline.transform(test)
# Update the CAT_FEATS
# CAT_FEATS = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage', #old
# 'Age_Group', 'Symptom_Score'] # new
# # Update the NUM_FEATS ????
AllData = pd.concat([train,test],axis=0)
group_by_cols = ['Stage' , 'Drug' , 'Sex' , 'Ascites','Hepatomegaly','Spiders',]
aggregation_strategies = ['mean' , 'max' , 'min' , 'std']
cols_to_agg = ["Bilirubin","N_Days","Cholesterol","Albumin","Copper","Alk_Phos","SGOT","Tryglicerides","Platelets","Prothrombin"]
sep = train.shape[0]
for col_to_agg in cols_to_agg:
    for col in group_by_cols:
        for strategy in aggregation_strategies:
            AllData[f'{col_to_agg} {strategy} by {col}'] = AllData.groupby(col)[col_to_agg].transform(strategy)
AllData.drop(group_by_cols,axis=1,inplace=True)
# encoder = OrdinalEncoder()
# df[group_by_cols] = encoder.fit_transform(df[group_by_cols])
train = AllData[: sep]
test = AllData[sep :].drop('Status',axis=1)
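# The triple loop above adds len(cols_to_agg) * len(group_by_cols) * len(aggregation_strategies)
# = 10 * 6 * 4 = 240 group-statistic columns before the grouping keys themselves are dropped.
print(train.shape, test.shape)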
X = train.drop("Status" , axis = 1 )
y = y.map(dict)
X_train ,X_test , y_train,y_test = train_test_split(X,y,test_size=0.2)
LazyClassifier().fit(X_train ,X_test , y_train,y_test)
| Model | Accuracy | Balanced Accuracy | ROC AUC | F1 Score | Time Taken |
|---|---|---|---|---|---|
| LGBMClassifier | 0.83 | 0.62 | None | 0.82 | 0.36 |
| NearestCentroid | 0.68 | 0.62 | None | 0.73 | 0.03 |
| XGBClassifier | 0.83 | 0.61 | None | 0.82 | 0.36 |
| AdaBoostClassifier | 0.81 | 0.60 | None | 0.80 | 0.84 |
| BaggingClassifier | 0.81 | 0.59 | None | 0.80 | 1.58 |
| RandomForestClassifier | 0.83 | 0.59 | None | 0.81 | 2.62 |
| BernoulliNB | 0.72 | 0.57 | None | 0.74 | 0.05 |
| ExtraTreesClassifier | 0.82 | 0.57 | None | 0.80 | 1.24 |
| PassiveAggressiveClassifier | 0.73 | 0.56 | None | 0.75 | 0.06 |
| LabelSpreading | 0.74 | 0.55 | None | 0.74 | 2.49 |
| DecisionTreeClassifier | 0.73 | 0.55 | None | 0.73 | 0.21 |
| LabelPropagation | 0.73 | 0.55 | None | 0.73 | 2.13 |
| KNeighborsClassifier | 0.78 | 0.53 | None | 0.77 | 0.19 |
| SGDClassifier | 0.80 | 0.53 | None | 0.78 | 0.16 |
| LogisticRegression | 0.80 | 0.53 | None | 0.78 | 0.10 |
| SVC | 0.80 | 0.53 | None | 0.78 | 1.80 |
| LinearSVC | 0.79 | 0.52 | None | 0.78 | 2.81 |
| Perceptron | 0.77 | 0.52 | None | 0.76 | 0.05 |
| CalibratedClassifierCV | 0.79 | 0.52 | None | 0.78 | 14.14 |
| LinearDiscriminantAnalysis | 0.79 | 0.52 | None | 0.77 | 0.07 |
| RidgeClassifier | 0.79 | 0.52 | None | 0.77 | 0.03 |
| RidgeClassifierCV | 0.79 | 0.52 | None | 0.77 | 0.05 |
| ExtraTreeClassifier | 0.70 | 0.50 | None | 0.70 | 0.03 |
| GaussianNB | 0.13 | 0.40 | None | 0.17 | 0.04 |
| QuadraticDiscriminantAnalysis | 0.19 | 0.39 | None | 0.26 | 0.04 |
| DummyClassifier | 0.61 | 0.33 | None | 0.46 | 0.02 |
model2 = XGBClassifier().fit(X_train,y_train)
feature_importances = model2.feature_importances_
feature_names = model2.feature_names_in_
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('XGBoost - Feature Importances')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming you have already calculated feature importances and created feature_importance_df
# Filter features with importance greater than 0
selected_features = feature_importance_df[feature_importance_df['Importance'] > 0]
X = X[selected_features.Feature]
test = test[selected_features.Feature]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import numpy as np
# Assuming you have X_train, y_train for your multiclass classification problem
# Define the objective function for Optuna
def objective(trial):
    # Split the data into training and validation sets
    X_valid = X_test
    y_valid = y_test
    # Define the XGBoost parameters to be optimized
    params = {
        'objective': 'multi:softmax',  # For multiclass classification
        'num_class': len(np.unique(y_train)),  # Number of classes
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-4, 1.0),
        'alpha': trial.suggest_float('alpha', 1e-4, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
        'n_jobs': -1
    }
    # Create and train the XGBoost model
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    # Predict on the validation set
    y_pred = model.predict_proba(X_valid)
    # Calculate log loss
    loss = log_loss(y_valid, y_pred)
    return loss
# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
# Print the best parameters and their values
print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial
print('Value: ', trial.value)
print('Params: ')
for key, value in trial.params.items():
    print(f' {key}: {value}')
[I 2023-12-18 19:35:32,012] A new study created in memory with name: no-name-3088f359-7138-4803-9937-2aad132cfa77
[I 2023-12-18 19:35:46,029] Trial 0 finished with value: 0.467901439351651 and parameters: {'booster': 'dart', 'learning_rate': 0.12746943553980886, 'max_depth': 10, 'subsample': 0.6346355421802843, 'colsample_bytree': 0.816641272904935, 'lambda': 0.037217420773450346, 'alpha': 0.6762667042644042, 'min_child_weight': 6}. Best is trial 0 with value: 0.467901439351651.
[I 2023-12-18 19:35:46,189] Trial 1 finished with value: 0.6037975063023779 and parameters: {'booster': 'gblinear', 'learning_rate': 0.2991729870211477, 'max_depth': 10, 'subsample': 0.94887532050163, 'colsample_bytree': 0.6405892872125224, 'lambda': 0.312355003411337, 'alpha': 0.3507935732407453, 'min_child_weight': 2}. Best is trial 0 with value: 0.467901439351651.
[I 2023-12-18 19:35:46,541] Trial 2 finished with value: 0.46298152684355603 and parameters: {'booster': 'gbtree', 'learning_rate': 0.26953782341788446, 'max_depth': 5, 'subsample': 0.892787988701748, 'colsample_bytree': 0.9460900209863112, 'lambda': 0.48339904513109605, 'alpha': 0.24895141551110403, 'min_child_weight': 6}. Best is trial 2 with value: 0.46298152684355603.
[I 2023-12-18 19:35:47,001] Trial 3 finished with value: 0.468967842953452 and parameters: {'booster': 'gbtree', 'learning_rate': 0.1968570716692036, 'max_depth': 7, 'subsample': 0.6968008794555788, 'colsample_bytree': 0.9473767444269642, 'lambda': 0.9425810072775328, 'alpha': 0.47998117466167245, 'min_child_weight': 9}. Best is trial 2 with value: 0.46298152684355603.
[I 2023-12-18 19:35:47,742] Trial 4 finished with value: 0.553333358347255 and parameters: {'booster': 'gbtree', 'learning_rate': 0.22217501232681033, 'max_depth': 9, 'subsample': 0.5800458884599016, 'colsample_bytree': 0.9774549778662086, 'lambda': 0.23504152818580556, 'alpha': 0.21660861802534212, 'min_child_weight': 2}. Best is trial 2 with value: 0.46298152684355603.
[I 2023-12-18 19:35:48,368] Trial 5 finished with value: 0.5317804045845647 and parameters: {'booster': 'gbtree', 'learning_rate': 0.018117294927103383, 'max_depth': 7, 'subsample': 0.8699289173492628, 'colsample_bytree': 0.9189739682388844, 'lambda': 0.25347600798975467, 'alpha': 0.6081935011695926, 'min_child_weight': 6}. Best is trial 2 with value: 0.46298152684355603.
[I 2023-12-18 19:35:48,533] Trial 6 finished with value: 0.604705510005819 and parameters: {'booster': 'gblinear', 'learning_rate': 0.1425963716034376, 'max_depth': 3, 'subsample': 0.7847696219414522, 'colsample_bytree': 0.9313012161103533, 'lambda': 0.0808138985128455, 'alpha': 0.14907754293911185, 'min_child_weight': 1}. Best is trial 2 with value: 0.46298152684355603.
[I 2023-12-18 19:36:02,829] Trial 7 finished with value: 0.4455269381351693 and parameters: {'booster': 'dart', 'learning_rate': 0.06401154936301488, 'max_depth': 5, 'subsample': 0.8219892269754658, 'colsample_bytree': 0.7577975087536928, 'lambda': 0.5601645798142655, 'alpha': 0.37497662789569375, 'min_child_weight': 5}. Best is trial 7 with value: 0.4455269381351693.
[I 2023-12-18 19:36:17,008] Trial 8 finished with value: 0.4471619168379527 and parameters: {'booster': 'dart', 'learning_rate': 0.18518614626695104, 'max_depth': 4, 'subsample': 0.5772833539917517, 'colsample_bytree': 0.8790190343647026, 'lambda': 0.014618704252580753, 'alpha': 0.7750933280799106, 'min_child_weight': 6}. Best is trial 7 with value: 0.4455269381351693.
[I 2023-12-18 19:36:31,410] Trial 9 finished with value: 0.4502961995945259 and parameters: {'booster': 'dart', 'learning_rate': 0.06623971620335108, 'max_depth': 10, 'subsample': 0.8259893721747179, 'colsample_bytree': 0.538621853070705, 'lambda': 0.7644821198823564, 'alpha': 0.04035767654397569, 'min_child_weight': 8}. Best is trial 7 with value: 0.4455269381351693.
[I 2023-12-18 19:36:45,577] Trial 10 finished with value: 0.4394526359630778 and parameters: {'booster': 'dart', 'learning_rate': 0.09714866603753744, 'max_depth': 5, 'subsample': 0.9983406881809331, 'colsample_bytree': 0.7233332089220278, 'lambda': 0.6518618137378873, 'alpha': 0.9956815364834396, 'min_child_weight': 4}. Best is trial 10 with value: 0.4394526359630778.
[I 2023-12-18 19:36:59,174] Trial 11 finished with value: 0.4420806868129429 and parameters: {'booster': 'dart', 'learning_rate': 0.07513331116676263, 'max_depth': 5, 'subsample': 0.9758312511857521, 'colsample_bytree': 0.7334186153904888, 'lambda': 0.630752397615246, 'alpha': 0.9445299618185905, 'min_child_weight': 4}. Best is trial 10 with value: 0.4394526359630778.
[I 2023-12-18 19:37:14,029] Trial 12 finished with value: 0.43921844570171875 and parameters: {'booster': 'dart', 'learning_rate': 0.10401893859068073, 'max_depth': 6, 'subsample': 0.9969018827861796, 'colsample_bytree': 0.6923568436041777, 'lambda': 0.6648439228201306, 'alpha': 0.9902308244863821, 'min_child_weight': 4}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:37:28,516] Trial 13 finished with value: 0.44245942822750606 and parameters: {'booster': 'dart', 'learning_rate': 0.11324890072656754, 'max_depth': 6, 'subsample': 0.995416191138221, 'colsample_bytree': 0.6738320949753753, 'lambda': 0.7540313969577961, 'alpha': 0.9457505113045466, 'min_child_weight': 4}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:37:42,910] Trial 14 finished with value: 0.4621930920197788 and parameters: {'booster': 'dart', 'learning_rate': 0.1107416117233408, 'max_depth': 8, 'subsample': 0.928078684290883, 'colsample_bytree': 0.6090522222998729, 'lambda': 0.47879344029572646, 'alpha': 0.9932511796552741, 'min_child_weight': 3}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:37:57,967] Trial 15 finished with value: 0.4474133569466753 and parameters: {'booster': 'dart', 'learning_rate': 0.16567599644396172, 'max_depth': 6, 'subsample': 0.9894364006758499, 'colsample_bytree': 0.7156396424950335, 'lambda': 0.7066524721845684, 'alpha': 0.8202046828158043, 'min_child_weight': 4}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:37:58,148] Trial 16 finished with value: 0.6316962115326096 and parameters: {'booster': 'gblinear', 'learning_rate': 0.08711852917187111, 'max_depth': 3, 'subsample': 0.9049839106948524, 'colsample_bytree': 0.8025284576951568, 'lambda': 0.9638207834509299, 'alpha': 0.8538986216501384, 'min_child_weight': 10}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:38:12,686] Trial 17 finished with value: 0.5068721519517998 and parameters: {'booster': 'dart', 'learning_rate': 0.02562611100062981, 'max_depth': 4, 'subsample': 0.7374955773614293, 'colsample_bytree': 0.6812435384955904, 'lambda': 0.8447882061363468, 'alpha': 0.7104241446438149, 'min_child_weight': 8}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:38:27,084] Trial 18 finished with value: 0.46975695094155423 and parameters: {'booster': 'dart', 'learning_rate': 0.14460317257947905, 'max_depth': 8, 'subsample': 0.9423238114748542, 'colsample_bytree': 0.5885460205993664, 'lambda': 0.6251247753398984, 'alpha': 0.8795546103233672, 'min_child_weight': 3}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:38:27,273] Trial 19 finished with value: 0.629388936566327 and parameters: {'booster': 'gblinear', 'learning_rate': 0.11380824652881756, 'max_depth': 6, 'subsample': 0.8663594374183559, 'colsample_bytree': 0.7854091512126957, 'lambda': 0.8508191849820892, 'alpha': 0.9865412812063501, 'min_child_weight': 5}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:38:41,654] Trial 20 finished with value: 0.46607702807814266 and parameters: {'booster': 'dart', 'learning_rate': 0.0442070376743758, 'max_depth': 4, 'subsample': 0.9411599119909108, 'colsample_bytree': 0.5076933477492662, 'lambda': 0.41763765780594386, 'alpha': 0.7633907482071974, 'min_child_weight': 7}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:38:56,333] Trial 21 finished with value: 0.4426561686027803 and parameters: {'booster': 'dart', 'learning_rate': 0.08473267509648494, 'max_depth': 5, 'subsample': 0.9981744676865851, 'colsample_bytree': 0.7268553158627814, 'lambda': 0.6181456391660245, 'alpha': 0.8947372896442928, 'min_child_weight': 4}. Best is trial 12 with value: 0.43921844570171875.
[I 2023-12-18 19:39:10,268] Trial 22 finished with value: 0.4387168740928328 and parameters: {'booster': 'dart', 'learning_rate': 0.09748262983754383, 'max_depth': 5, 'subsample': 0.9710653610936847, 'colsample_bytree': 0.7046711478715056, 'lambda': 0.6461469944331844, 'alpha': 0.989009892940536, 'min_child_weight': 3}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:39:25,689] Trial 23 finished with value: 0.4504070528951304 and parameters: {'booster': 'dart', 'learning_rate': 0.09474700925189229, 'max_depth': 7, 'subsample': 0.956168731989341, 'colsample_bytree': 0.6775801163040833, 'lambda': 0.7183133953251563, 'alpha': 0.8856805130297458, 'min_child_weight': 3}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:39:40,363] Trial 24 finished with value: 0.4519329755242491 and parameters: {'booster': 'dart', 'learning_rate': 0.05132492278698467, 'max_depth': 6, 'subsample': 0.9083787931721038, 'colsample_bytree': 0.763918252688886, 'lambda': 0.5379270236495117, 'alpha': 0.9713820831248559, 'min_child_weight': 1}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:39:54,832] Trial 25 finished with value: 0.4426732569935103 and parameters: {'booster': 'dart', 'learning_rate': 0.09905215127466048, 'max_depth': 4, 'subsample': 0.9574814336764771, 'colsample_bytree': 0.7019388853936809, 'lambda': 0.6482937428738084, 'alpha': 0.7900471797463317, 'min_child_weight': 2}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:40:09,052] Trial 26 finished with value: 0.4394620463877227 and parameters: {'booster': 'dart', 'learning_rate': 0.13041251799441955, 'max_depth': 5, 'subsample': 0.9932974327043721, 'colsample_bytree': 0.641671423409493, 'lambda': 0.5770069435779102, 'alpha': 0.9998628691744464, 'min_child_weight': 5}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:40:23,845] Trial 27 finished with value: 0.45148806423476734 and parameters: {'booster': 'dart', 'learning_rate': 0.07792708214629346, 'max_depth': 8, 'subsample': 0.9258464613597038, 'colsample_bytree': 0.8289379921512159, 'lambda': 0.6807325204530122, 'alpha': 0.8888730191604699, 'min_child_weight': 3}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:40:24,335] Trial 28 finished with value: 0.4577889685665366 and parameters: {'booster': 'gbtree', 'learning_rate': 0.04019005353248907, 'max_depth': 6, 'subsample': 0.8866388780300798, 'colsample_bytree': 0.7727188267725985, 'lambda': 0.7985549687778268, 'alpha': 0.8228925956150835, 'min_child_weight': 4}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:40:24,535] Trial 29 finished with value: 0.6258411623905882 and parameters: {'booster': 'gblinear', 'learning_rate': 0.1230665111221104, 'max_depth': 4, 'subsample': 0.9621530426609177, 'colsample_bytree': 0.7402705876338667, 'lambda': 0.6952247137716429, 'alpha': 0.6623240856405609, 'min_child_weight': 5}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:40:39,298] Trial 30 finished with value: 0.4512572866739116 and parameters: {'booster': 'dart', 'learning_rate': 0.09907482739672191, 'max_depth': 7, 'subsample': 0.5117261460658773, 'colsample_bytree': 0.8302793199804653, 'lambda': 0.5683541569798679, 'alpha': 0.7394600251050194, 'min_child_weight': 2}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:40:53,511] Trial 31 finished with value: 0.43987648225042597 and parameters: {'booster': 'dart', 'learning_rate': 0.1372664551876506, 'max_depth': 5, 'subsample': 0.9892848164159387, 'colsample_bytree': 0.6563127862287073, 'lambda': 0.5709686207733861, 'alpha': 0.9988333868754334, 'min_child_weight': 5}. Best is trial 22 with value: 0.4387168740928328.
[I 2023-12-18 19:41:07,550] Trial 32 finished with value: 0.43806385797710357 and parameters: {'booster': 'dart', 'learning_rate': 0.12534520470092694, 'max_depth': 5, 'subsample': 0.9967398584994234, 'colsample_bytree': 0.6466917057968393, 'lambda': 0.42819438565674883, 'alpha': 0.9262582478094988, 'min_child_weight': 7}. Best is trial 32 with value: 0.43806385797710357.
[I 2023-12-18 19:41:21,750] Trial 33 finished with value: 0.4405562684137925 and parameters: {'booster': 'dart', 'learning_rate': 0.12329669755176835, 'max_depth': 5, 'subsample': 0.9630309766491958, 'colsample_bytree': 0.7091839237473331, 'lambda': 0.40841548907495945, 'alpha': 0.9289515047298709, 'min_child_weight': 7}. Best is trial 32 with value: 0.43806385797710357.
[W 2023-12-18 19:41:22,994] Trial 34 failed with parameters: {'booster': 'dart', 'learning_rate': 0.1046316520251173, 'max_depth': 6, 'subsample': 0.9240501401593115, 'colsample_bytree': 0.6226575295516056, 'lambda': 0.48299441560410866, 'alpha': 0.9274487250155293, 'min_child_weight': 7} because of the following error: KeyboardInterrupt().
[W 2023-12-18 19:41:23,021] Trial 34 failed with value None.
KeyboardInterrupt
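# A minimal follow-up sketch (assumes the interrupted study above has at least one
# completed trial): the tuned values can be read back from study.best_params and
# dropped straight into a classifier, mirroring the objective() setup.
best_xgb = xgb.XGBClassifier(**study.best_params,
                             objective='multi:softmax',
                             num_class=len(np.unique(y_train)),
                             random_state=42, n_jobs=-1)
best_xgb.fit(X_train, y_train)
print("Best-trial Log Loss:", log_loss(y_test, best_xgb.predict_proba(X_test), labels=[0, 1, 2]))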
xgb_params ={'max_depth': 10,
'min_child_weight': 7,
'learning_rate': 0.03419253503641095,
'n_estimators': 472,
'subsample': 0.8843005833909504,
'colsample_bytree': 0.0966352677605082,
'random_state': 42,
'tree_method': 'hist',
'eval_metric': 'mlogloss',
'device' : 'cuda',
'verbosity': 2, }
model3 = XGBClassifier(**xgb_params).fit(X_train,y_train)
y_hat = model3.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
#Log Loss: 0.4349277405119213
#Log Loss: 0.4323600045757931
#Log Loss: 0.4292475293903112
Log Loss: 0.4329640144159542
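# A hedged cross-validation sketch (not part of the original run): the single 80/20
# split above can be noisy, so this reuses the already-imported StratifiedKFold to get
# a steadier log-loss estimate for the same xgb_params.
cv_scores = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for trn_idx, val_idx in skf.split(X, y):
    fold_model = XGBClassifier(**xgb_params).fit(X.iloc[trn_idx], y.iloc[trn_idx])
    fold_pred = fold_model.predict_proba(X.iloc[val_idx])
    cv_scores.append(log_loss(y.iloc[val_idx], fold_pred, labels=classes))
print("CV Log Loss:", np.mean(cv_scores))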
from lightgbm import LGBMClassifier
lgbm_params = {
'max_depth': 9, 'min_child_samples': 14,
'learning_rate': 0.034869481921747415,
'n_estimators': 274, 'min_child_weight': 9,
'subsample': 0.7717873512945741,
'colsample_bytree': 0.1702910221565107,
'reg_alpha': 0.10626128775335533,
'reg_lambda': 0.624196407787772,
'random_state': 42}
model = LGBMClassifier(**lgbm_params).fit(X_train,y_train)
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000735 seconds. You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3106
[LightGBM] [Info] Number of data points in the train set: 6324, number of used features: 35
[LightGBM] [Info] Start training from score -1.084013
[LightGBM] [Info] Start training from score -0.464579
[LightGBM] [Info] Start training from score -3.400249
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
y_hat = model.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
# Log Loss: 0.4329097797154753
#Mean Over all folds : 0.43297852006166754
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31)
# Log Loss: 0.4379695263416751
# model = XGBClassifier(**xgb_params)
# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# val_logloss = []
# models = []
# for i, (train_index,test_index) in enumerate(cv.split(X,y)):
# X_train , X_val = X.iloc[train_index], X.iloc[test_index]
# y_train, y_val = y.iloc[train_index], y.iloc[test_index]
# model.fit(X_train,y_train,verbose=False)
# models.append(model)
# train_preds= model.predict_proba(X_train)
# val_preds= model.predict_proba(X_val)
# train_logloss = log_loss(y_train , train_preds)
# test_logloss = log_loss(y_val , val_preds)
# val_logloss.append(test_logloss)
# print(f'Fold {i+1} \n')
# print(f'Train logloss: {train_logloss}')
# print(f'Validation logloss: {test_logloss}')
# print('-------------------\n')
# print(f"Mean Over all folds : {np.mean(val_logloss)}")
# model = LGBMClassifier(**lgbm_params)
# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# val_logloss = []
# models = []
# for i, (train_index,test_index) in enumerate(cv.split(X,y)):
# X_train , X_val = X.iloc[train_index], X.iloc[test_index]
# y_train, y_val = y.iloc[train_index], y.iloc[test_index]
# model.fit(X_train,y_train)
# models.append(model)
# train_preds= model.predict_proba(X_train)
# val_preds= model.predict_proba(X_val)
# train_logloss = log_loss(y_train , train_preds)
# test_logloss = log_loss(y_val , val_preds)
# val_logloss.append(test_logloss)
# print(f'Fold {i+1} \n')
# print(f'Train logloss: {train_logloss}')
# print(f'Validation logloss: {test_logloss}')
# print('-------------------\n')
# print(f"Mean Over all folds : {np.mean(val_logloss)}")
from sklearn.ensemble import VotingClassifier
voting = VotingClassifier([('lgbm' , model) , ('xgb' , model3) ],voting = 'soft').fit(X_train,y_train)
y_hat = voting.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
# Log Loss: 0.4256159895204896
encoder = OrdinalEncoder()
Ord_enco = ['Drug',"Sex"]
AllData[Ord_enco] = encoder.fit_transform(AllData[Ord_enco])
train = AllData.iloc[:n]
test = AllData.iloc[n:]
X = train.drop("Status" , axis = 1 )
y = train["Status"].map({'D': 0, 'C': 1, 'CL': 2})  # integer-encode the target; the order matches the Status_D / Status_C / Status_CL submission columns below
X_train ,X_test , y_train,y_test = train_test_split(X,y,test_size=0.2)
X
LazyClassifier().fit(X_train ,X_test , y_train,y_test)
from lightgbm import LGBMClassifier
lgbm_params = {
'max_depth': 9, 'min_child_samples': 14,
'learning_rate': 0.034869481921747415,
'n_estimators': 274, 'min_child_weight': 9,
'subsample': 0.7717873512945741,
'colsample_bytree': 0.1702910221565107,
'reg_alpha': 0.10626128775335533,
'reg_lambda': 0.624196407787772,
'random_state': 42}
model = LGBMClassifier(**lgbm_params).fit(X_train,y_train)
y_hat = model.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
# feature_importances = model.feature_importances_
# feature_names = model.feature_name_
# feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
# feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# # Plot feature importances
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
# plt.title('LGBM - Feature Importances')
# plt.show()
# selected_features = feature_importance_df[feature_importance_df['Importance'] > 30]
# # Plot feature importances
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Importance', y='Feature', data=selected_features, palette='viridis')
# plt.title('LGBM - Selected Features with Importance > 30')
# plt.show()
# selected_features.Feature
# original_list = selected_features.Feature
# modified_list = [string.replace('_', ' ') for string in original_list]
# X_copy = X[['N_Days', 'Alk_Phos', 'Tryglicerides', 'Albumin', 'Bilirubin', 'Prothrombin', 'Age', 'Platelets', 'Cholesterol', 'SGOT', 'Copper', 'Age_y', 'Desease_count', 'N_Days mean by Stage', 'Cholesterol mean by Stage', 'Bilirubin mean by Drug', 'Copper std by Stage', 'N_Days mean by Drug', 'Alk_Phos std by Stage', 'Bilirubin std by Hepatomegaly', 'Bilirubin mean by Stage', 'Albumin std by Stage', 'Bilirubin std by Spiders', 'Cholesterol mean by Drug', 'Prothrombin max by Stage', 'Bilirubin std by Stage', 'Cholesterol std by Stage', 'SGOT std by Stage', 'Cholesterol std by Spiders', 'N_Days std by Spiders', 'N_Days std by Stage', 'Albumin std by Drug', 'N_Days mean by Sex', 'N_Days mean by Hepatomegaly', 'Tryglicerides std by Stage', 'Albumin mean by Spiders', 'Bilirubin mean by Sex', 'Albumin mean by Stage', 'Bilirubin mean by Hepatomegaly', 'SGOT mean by Stage', 'Cholesterol std by Hepatomegaly', 'N_Days std by Drug', 'Cholesterol std by Drug', 'N_Days std by Sex', 'Albumin std by Spiders', 'Copper std by Drug', 'Albumin mean by Hepatomegaly', 'Platelets mean by Stage', 'Bilirubin std by Edema', 'Prothrombin mean by Stage', 'N_Days max by Edema', 'Bilirubin std by Drug', 'Cholesterol max by Edema', 'Albumin mean by Drug', 'Copper mean by Hepatomegaly', 'Albumin max by Stage', 'Bilirubin std by Sex', 'Alk_Phos std by Drug', 'Cholesterol mean by Spiders', 'Bilirubin mean by Ascites', 'Copper mean by Spiders', 'Cholesterol std by Edema', 'Prothrombin min by Stage']]
# X_train_copy,X_test_copy , y_train_copy,y_test_copy = train_test_split(X_copy, y ,test_size= 0.2)
# from lightgbm import LGBMClassifier
# lgbm_params = {
# 'max_depth': 9, 'min_child_samples': 14,
# 'learning_rate': 0.034869481921747415,
# 'n_estimators': 274, 'min_child_weight': 9,
# 'subsample': 0.7717873512945741,
# 'colsample_bytree': 0.1702910221565107,
# 'reg_alpha': 0.10626128775335533,
# 'reg_lambda': 0.624196407787772,
# 'random_state': 42}
# model = LGBMClassifier(**lgbm_params).fit(X_train_copy,y_train_copy)
# y_hat = model.predict_proba(X_test_copy)
# classes = [0, 1, 2]
# logloss = log_loss(y_test_copy, y_hat, labels=classes)
# print("Log Loss:", logloss)
# Fit LazyClassifier on the existing train/test split to get a quick model leaderboard
# LazyClassifier().fit(X_train, X_test, y_train, y_test)
model2 = XGBClassifier().fit(X_train,y_train)
feature_importances = model2.feature_importances_
feature_names = model2.feature_names_in_
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('XGBoost - Feature Importances')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming you have already calculated feature importances and created feature_importance_df
# Filter features with importance greater than 0
selected_features = feature_importance_df[feature_importance_df['Importance'] > 0]
# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=selected_features, palette='viridis')
plt.title('XGB - Selected Features with Importance > 0')
plt.show()
X = X[selected_features.Feature]
test = test[selected_features.Feature]
X_train,X_test , y_train,y_test = train_test_split(X, y ,test_size= 0.2)
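# The hold-out split above is unstratified; a stratified variant (shown for
# illustration only, it is not what produced the scores reported below) keeps
# the Status class ratios comparable between the two folds:
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(y_tr_s.value_counts(normalize=True))
print(y_te_s.value_counts(normalize=True))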
# import optuna
# import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import log_loss
# import numpy as np
# # Assuming you have X_train, y_train for your multiclass classification problem
# # Define the objective function for Optuna
# def objective(trial):
# # Split the data into training and validation sets
# X_valid = X_test
# y_valid = y_test
# # Define the XGBoost parameters to be optimized
# params = {
# 'objective': 'multi:softmax', # For multiclass classification
# 'num_class': len(np.unique(y_train)), # Number of classes
# 'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
# 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
# 'max_depth': trial.suggest_int('max_depth', 3, 10),
# 'subsample': trial.suggest_float('subsample', 0.5, 1.0),
# 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
# 'lambda': trial.suggest_float('lambda', 1e-4, 1.0),
# 'alpha': trial.suggest_float('alpha', 1e-4, 1.0),
# 'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
# 'random_state': 42,
# 'n_jobs': -1
# }
# # Create and train the XGBoost model
# model = xgb.XGBClassifier(**params)
# model.fit(X_train, y_train)
# # Predict on the validation set
# y_pred = model.predict_proba(X_valid)
# # Calculate log loss
# loss = log_loss(y_valid, y_pred)
# return loss
# # Create a study object and optimize the objective function
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)
# # Print the best parameters and their values
# print('Number of finished trials: ', len(study.trials))
# print('Best trial:')
# trial = study.best_trial
# print('Value: ', trial.value)
# print('Params: ')
# for key, value in trial.params.items():
# print(f' {key}: {value}')
xgb_params ={'max_depth': 10,
'min_child_weight': 7,
'learning_rate': 0.03419253503641095,
'n_estimators': 472,
'subsample': 0.8843005833909504,
'colsample_bytree': 0.0966352677605082,
'random_state': 42,
'tree_method': 'hist',
'eval_metric': 'mlogloss',
'device' : 'cuda',
'verbosity': 2, }
model3 = XGBClassifier(**xgb_params).fit(X_train,y_train)
y_hat = model3.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
#Log Loss: 0.4349277405119213
#Log Loss: 0.4323600045757931
import optuna
from sklearn.ensemble import ExtraTreesClassifier

def objective(trial):
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    # Define hyperparameters to be optimized
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.1, 0.5)
    # Create ExtraTreesClassifier with suggested hyperparameters
    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    # Fit the model on training data
    model.fit(X_train, y_train)
    # Predict probabilities for the validation set
    y_val_pred_proba = model.predict_proba(X_val)
    # Calculate log loss
    loss = log_loss(y_val, y_val_pred_proba)
    return loss
# Assuming X and y are your feature matrix and target vector
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
# Get the best parameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)
# Train the model with the best parameters on the entire dataset
best_model = ExtraTreesClassifier(**best_params, random_state=42)
best_model.fit(X, y)
from sklearn.ensemble import ExtraTreesClassifier
Extra = ExtraTreesClassifier(**{'n_estimators': 362, 'max_depth': 21, 'min_samples_split': 0.12597324652958813, 'min_samples_leaf': 0.10043794568972335}
).fit(X_train,y_train)
y_hat = Extra.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
# other_xdgb_para "{'max_depth': 10,
# 'min_child_weight': 7,
# 'learning_rate': 0.03419253503641095,
# 'n_estimators': 472,
# 'subsample': 0.8843005833909504,
# 'colsample_bytree': 0.0966352677605082,
# 'random_state': 42,
# 'tree_method': 'hist',
# 'eval_metric': 'mlogloss',
# 'device' : 'cuda',
# 'verbosity': 2, }"
from lightgbm import LGBMClassifier
lgbm_params = {
'max_depth': 9, 'min_child_samples': 14,
'learning_rate': 0.034869481921747415,
'n_estimators': 274, 'min_child_weight': 9,
'subsample': 0.7717873512945741,
'colsample_bytree': 0.1702910221565107,
'reg_alpha': 0.10626128775335533,
'reg_lambda': 0.624196407787772,
'random_state': 42}
model = LGBMClassifier(**lgbm_params).fit(X_train,y_train)
y_hat = model.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
# Log Loss: 0.4329097797154753
from sklearn.ensemble import RandomForestClassifier
model_tree = RandomForestClassifier(n_estimators=250 , max_depth= 10 ,).fit(X_train,y_train)
y_hat = model_tree.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
import optuna
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 50, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'depth': trial.suggest_int('depth', 2, 12),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10)
    }
    # Create CatBoost classifier with the suggested parameters
    clf = CatBoostClassifier(**params)
    # Fit the model
    clf.fit(X_train, y_train, verbose=False)
    # Make predictions on the test set
    y_pred_proba = clf.predict_proba(X_test)
    # Calculate log loss
    logloss = log_loss(y_test, y_pred_proba)
    return logloss
# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)
# Train the final model with the best hyperparameters
best_clf = CatBoostClassifier(**best_params)
best_clf.fit(X_train, y_train)
# Make predictions on the test set
final_pred_proba = best_clf.predict_proba(X_test)
# Calculate log loss on the test set
final_logloss = log_loss(y_test, final_pred_proba)
print("Final Log Loss on Test Set:", final_logloss)
cat_params = {'iterations': 469,
'depth': 20,
'min_data_in_leaf': 11,
'learning_rate': 0.13812945166006543,
'grow_policy': 'Lossguide',
'bootstrap_type' : 'Bernoulli'}
cat = CatBoostClassifier(**cat_params).fit(X_train,y_train)
y_hat = cat.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
xgb2_params = {'objective': 'multi:softprob', 'max_depth': 9, 'min_child_weight': 8, 'learning_rate': 0.0337716365315986, 'n_estimators': 733, 'subsample': 0.6927955384688348, 'colsample_bytree': 0.1234702658812108, 'reg_alpha': 0.18561628377665318, 'reg_lambda': 0.5565488299127089, 'random_state': 42}  # 'multi_logloss' is not a valid XGBoost objective; multi:softprob is the multiclass probability objective
xgb2 = XGBClassifier(**xgb2_params).fit(X_train,y_train)
y_hat = xgb2.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
lgb2_params = {'objective': 'multiclass', 'max_depth': 8, 'min_child_samples': 16, 'learning_rate': 0.014553931721109505, 'n_estimators': 779, 'min_child_weight': 9, 'subsample': 0.44799071313755495, 'colsample_bytree': 0.15868021337418978, 'reg_alpha': 0.17992542471160344, 'reg_lambda': 0.8231621177994548, 'random_state': 42}  # 'multi_logloss' is a LightGBM metric, not an objective; the multiclass objective is 'multiclass'
lgb2 = LGBMClassifier(**lgb2_params).fit(X_train,y_train)
y_hat = lgb2.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
xgb2 = XGBClassifier(**xgb2_params)
from sklearn.ensemble import HistGradientBoostingClassifier
import optuna
def objective(trial):
    # Define hyperparameters to optimize
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1.0, log=True)
    max_iter = trial.suggest_int('max_iter', 50, 500)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 50)
    # Create and train the HistGradientBoostingClassifier
    model = HistGradientBoostingClassifier(
        learning_rate=learning_rate,
        max_iter=max_iter,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        random_state=42
    )
    model.fit(X_train, y_train)
    # Make probabilistic predictions on the test set for log loss calculation
    y_pred_proba = model.predict_proba(X_test)
    # Calculate log loss
    logloss = log_loss(y_test, y_pred_proba)
    return logloss
# Create a study and optimize the objective function
study = optuna.create_study(direction='minimize') # Note the direction is 'minimize' for log loss
study.optimize(objective, n_trials=100)
# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)
# Train the final model with the best hyperparameters
best_model = HistGradientBoostingClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)
# Make probabilistic predictions on the test set for log loss calculation
final_predictions_proba = best_model.predict_proba(X_test)
# Calculate log loss of the final model on the test set
final_logloss = log_loss(y_test, final_predictions_proba)
print("Final Log Loss on Test Set:", final_logloss)
from sklearn.ensemble import HistGradientBoostingClassifier
hist_params = {'l2_regularization': 8.876168706639714,
'early_stopping': False,
'learning_rate': 0.009956485590638034,
'max_iter': 500,
'max_depth': 16,
'max_bins': 255,
'min_samples_leaf': 16,
'max_leaf_nodes': 18,
'random_state': 3}
hist = HistGradientBoostingClassifier(**hist_params).fit(X_train,y_train)
print(log_loss(y_test , hist.predict_proba(X_test)))
from sklearn.ensemble import VotingClassifier
voting = VotingClassifier([('lgbm' , model) , ('xgb' , model3) , ('cat' , cat)],voting = 'soft').fit(X_train,y_train)
y_hat = voting.predict_proba(X_test)
classes = [0, 1, 2]
logloss = log_loss(y_test, y_hat, labels=classes)
print("Log Loss:", logloss)
# Log Loss: 0.4256159895204896
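# Sanity check (not part of the original run): with voting='soft' and no
# weights, the ensemble's probabilities are just the mean of the members'
# predict_proba outputs, so the two should match.
manual_avg = np.mean([est.predict_proba(X_test) for est in voting.estimators_], axis=0)
print(np.allclose(manual_avg, voting.predict_proba(X_test)))  # expected: True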
val_logloss = []
cv = KFold(n_splits=5, shuffle=True, random_state=42)
models = []
for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    voting.fit(X_train, y_train)
    models.append(voting)
    train_preds = voting.predict_proba(X_train)
    val_preds = voting.predict_proba(X_val)
    train_logloss = log_loss(y_train, train_preds)
    test_logloss = log_loss(y_val, val_preds)
    val_logloss.append(test_logloss)
    print(f'Fold {i+1} \n')
    print(f'Train logloss: {train_logloss}')
    print(f'Validation logloss: {test_logloss}')
    print('-------------------\n')
print(f"Mean Over all folds : {np.mean(val_logloss)}")
#Mean Over all folds : 0.428706267095177 for 3
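# Alternative scoring (an extra check, not part of the original notebook):
# collect out-of-fold probabilities and score them once, which weights every
# row equally instead of averaging the per-fold log losses above.
oof = np.zeros((len(X), 3))  # 3 classes: D, C, CL encoded as 0, 1, 2
cv = KFold(n_splits=5, shuffle=True, random_state=42)
for tr_idx, va_idx in cv.split(X, y):
    voting.fit(X.iloc[tr_idx], y.iloc[tr_idx])
    oof[va_idx] = voting.predict_proba(X.iloc[va_idx])
print("OOF log loss:", log_loss(y, oof))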
print(type(model3) , type(model) , type(cat))
from sklearn.metrics import confusion_matrix, f1_score
models = [model3, model, cat, hist, voting]
fig, ax = plt.subplots(1, len(models), figsize=(25, 4))  # one axis per model (the original hard-coded 4 axes for 5 models)
for i, clf in enumerate(models):  # separate loop variable so the fitted LGBM `model` is not overwritten
    y_pred_proba = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    logloss = log_loss(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes, ax=ax[i])
    ax[i].set_title(str(type(clf)))
    ax[i].set_xlabel('Predicted')
    ax[i].set_ylabel('Actual')
    print(f'f1_score for {str(type(clf))} : {f1}')
    print(f'log_loss for {str(type(clf))} : {logloss}')
    print('-'*100)
fig.tight_layout()
fig.show()
import optuna
def objective(trial):
    lgb_weight = trial.suggest_int('lgb_weight', 0, 50)
    xgb_weight = trial.suggest_int('xgb_weight', 0, 100 - lgb_weight)
    cb_weight = 100 - lgb_weight - xgb_weight
    weights = [lgb_weight / 100, xgb_weight / 100, cb_weight / 100]
    # Pass the suggested weights to the ensemble and fit only on the training split
    ensemble = VotingClassifier([('lgbm', model), ('xgb', model3), ('cat', cat)],
                                voting='soft', weights=weights).fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)
    return f1_score(y_test, y_pred, average='weighted')
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
# Get the best parameters
best_params = study.best_params
best_weights = [best_params['lgb_weight'], best_params['xgb_weight'], 100- best_params['lgb_weight'] - best_params['xgb_weight']]
print("Best Weights:", best_weights)
from sklearn.ensemble import VotingClassifier
Ensemble = VotingClassifier([('lgbm' , model) , ('xgb' , model3) , ('cat' , cat)],voting = 'soft', weights = [32, 46, 22]).fit(X_train,y_train)
%matplotlib inline
y_pred = Ensemble.predict(X_test)
y_pred_proba = Ensemble.predict_proba(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)
logloss = log_loss(y_test, y_pred_proba)
print("Log Loss:", logloss)
conf_matrix_2 = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix_2, annot=True ,fmt= 'd')
plt.show()
val_logloss = []
cv = KFold(n_splits=5, shuffle=True, random_state=42)
models = []
for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    Ensemble.fit(X_train, y_train)
    models.append(Ensemble)
    train_preds = Ensemble.predict_proba(X_train)
    val_preds = Ensemble.predict_proba(X_val)
    train_logloss = log_loss(y_train, train_preds)
    test_logloss = log_loss(y_val, val_preds)
    val_logloss.append(test_logloss)
    print(f'Fold {i+1} \n')
    print(f'Train logloss: {train_logloss}')
    print(f'Validation logloss: {test_logloss}')
    print('-------------------\n')
print(f"Mean Over all folds : {np.mean(val_logloss)}")
# Mean Over all folds : 0.4270813308353992
#Mean Over all folds : 0.428706267095177 for 3
y_pred = voting.predict_proba(test)
from scipy.optimize import minimize
catboost_pred_proba = cat.predict_proba(X_test)
xgboost_pred_proba = model3.predict_proba(X_test)
model6_pred_proba = model.predict_proba(X_test)
# Define the objective function for optimization
def objective(weights):
    blended_pred_proba = (
        weights[0] * catboost_pred_proba +
        weights[1] * xgboost_pred_proba +
        weights[2] * model6_pred_proba
    )
    blend_log_loss = log_loss(y_test, blended_pred_proba)
    return blend_log_loss
# Optimize the blend weights with bounded L-BFGS-B
result = minimize(objective, [1/3, 1/3, 1/3], bounds=[(0, 1), (0, 1), (0, 1)], method='L-BFGS-B')
# Get the best weights
best_weights = result.x
print(f'Best Weights: {best_weights}')
# Blend predictions with the best weights
final_blended_pred_proba = (
best_weights[0] * catboost_pred_proba +
best_weights[1] * xgboost_pred_proba +
best_weights[2] * model6_pred_proba
)
# Calculate log loss on the validation set with the best weights
final_blend_log_loss = log_loss(y_test, final_blended_pred_proba)
print(f'Final Blended Log Loss: {final_blend_log_loss}')
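# The L-BFGS-B search above bounds each weight in [0, 1] but does not force
# the weights to sum to 1, so the blended probabilities can be slightly
# mis-scaled. A constrained variant (a sketch, not what produced the printed
# result) adds an equality constraint on the weight sum and uses SLSQP:
from scipy.optimize import minimize
cons = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0},)
res = minimize(objective, [1/3, 1/3, 1/3], bounds=[(0, 1)] * 3, constraints=cons, method='SLSQP')
print(f'Simplex-constrained weights: {res.x}, log loss: {res.fun}')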
model = XGBClassifier(**xgb_params)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
val_logloss = []
models = []
for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train, verbose=False)
    models.append(model)
    train_preds = model.predict_proba(X_train)
    val_preds = model.predict_proba(X_val)
    train_logloss = log_loss(y_train, train_preds)
    test_logloss = log_loss(y_val, val_preds)
    val_logloss.append(test_logloss)
    print(f'Fold {i+1} \n')
    print(f'Train logloss: {train_logloss}')
    print(f'Validation logloss: {test_logloss}')
    print('-------------------\n')
print(f"Mean Over all folds : {np.mean(val_logloss)}")
model = XGBClassifier(**xgb2_params)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
val_logloss = []
models = []
for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train, verbose=False)
    models.append(model)
    train_preds = model.predict_proba(X_train)
    val_preds = model.predict_proba(X_val)
    train_logloss = log_loss(y_train, train_preds)
    test_logloss = log_loss(y_val, val_preds)
    val_logloss.append(test_logloss)
    print(f'Fold {i+1} \n')
    print(f'Train logloss: {train_logloss}')
    print(f'Validation logloss: {test_logloss}')
    print('-------------------\n')
print(f"Mean Over all folds : {np.mean(val_logloss)}")
model = CatBoostClassifier(**cat_params)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
val_logloss = []
models = []
for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train, verbose=False)
    models.append(model)
    train_preds = model.predict_proba(X_train)
    val_preds = model.predict_proba(X_val)
    train_logloss = log_loss(y_train, train_preds)
    test_logloss = log_loss(y_val, val_preds)
    val_logloss.append(test_logloss)
    print(f'Fold {i+1} \n')
    print(f'Train logloss: {train_logloss}')
    print(f'Validation logloss: {test_logloss}')
    print('-------------------\n')
print(f"Mean Over all folds : {np.mean(val_logloss)}")
model = LGBMClassifier(**lgbm_params)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
val_logloss = []
models = []
for i, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    models.append(model)
    train_preds = model.predict_proba(X_train)
    val_preds = model.predict_proba(X_val)
    train_logloss = log_loss(y_train, train_preds)
    test_logloss = log_loss(y_val, val_preds)
    val_logloss.append(test_logloss)
    print(f'Fold {i+1} \n')
    print(f'Train logloss: {train_logloss}')
    print(f'Validation logloss: {test_logloss}')
    print('-------------------\n')
print(f"Mean Over all folds : {np.mean(val_logloss)}")
y_pred = np.array([model.predict_proba(test)/5 for model in models]).sum(axis=0)
log_loss(y_test , model3.predict_proba(X_test))
y_pred = np.array([model.predict_proba(test)/5 for model in models]).sum(axis=0)
y_pred = Ensemble.predict_proba(test)
sub['Status_D'] = y_pred[:,0]
sub['Status_C'] = y_pred[:,1]
sub['Status_CL'] = y_pred[:,2]
sub.to_csv('Cirrhosis5.csv',index=False)
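# Quick sanity check on the written file (an extra step, not part of the
# original pipeline): every row of class probabilities should sum to ~1 and
# contain no negative values.
check = pd.read_csv('Cirrhosis5.csv')
prob_cols = ['Status_D', 'Status_C', 'Status_CL']
print(np.allclose(check[prob_cols].sum(axis=1), 1.0))
print((check[prob_cols] >= 0).all().all())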
from sklearn.inspection import PartialDependenceDisplay
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize = (15, 15))
fig.suptitle('Partial Dependence Plots of Status = D', fontsize = 16)
HG_disp = PartialDependenceDisplay.from_estimator(voting, X, ['Bilirubin mean by Drug', 'Prothrombin', 'Alk_Phos'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 0,
response_method = 'predict_proba',
ax = ax1)
ax1.set_title('voting Partial Dependency Plots')
LGBM_disp = PartialDependenceDisplay.from_estimator(model, X, ['Bilirubin mean by Drug', 'Prothrombin', 'Alk_Phos'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 0,
response_method = 'predict_proba',
ax = ax2)
ax2.set_title('LGBM Partial Dependency Plots')
XGB_disp = PartialDependenceDisplay.from_estimator(model3, X, ['Bilirubin mean by Drug', 'Prothrombin', 'Alk_Phos'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 0,
response_method = 'predict_proba',
ax = ax3)
ax3.set_title('XGBoost Partial Dependency Plots')
plt.savefig('D_partial_dependency_plots.png');
y.value_counts()
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize = (15, 15))
fig.suptitle('Partial Dependence Plots of Status = C', fontsize = 16)
HG_disp = PartialDependenceDisplay.from_estimator(voting, X, ['Bilirubin mean by Drug', 'Prothrombin', 'Alk_Phos'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 1,
response_method = 'predict_proba',
ax = ax1)
ax1.set_title('voting Partial Dependency Plots')
LGBM_disp = PartialDependenceDisplay.from_estimator(model, X, ['Bilirubin mean by Drug', 'Prothrombin', 'Alk_Phos'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 1,
response_method = 'predict_proba',
ax = ax2)
ax2.set_title('LGBM Partial Dependency Plots')
XGB_disp = PartialDependenceDisplay.from_estimator(model3, X, ['Bilirubin mean by Drug', 'Prothrombin', 'Alk_Phos'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 1,
response_method = 'predict_proba',
ax = ax3)
ax3.set_title('XGBoost Partial Dependency Plots')
plt.savefig('C_partial_dependency_plots.png');
X.columns
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize = (15, 15))
fig.suptitle('Partial Dependence Plots of Status = D', fontsize = 16)
HG_disp = PartialDependenceDisplay.from_estimator(voting, X, [ 'SGOT', 'Copper', 'Platelets'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 0,
response_method = 'predict_proba',
ax = ax1)
ax1.set_title('voting Partial Dependency Plots')
LGBM_disp = PartialDependenceDisplay.from_estimator(model, X,[ 'SGOT', 'Copper', 'Platelets'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 0,
response_method = 'predict_proba',
ax = ax2)
ax2.set_title('LGBM Partial Dependency Plots')
XGB_disp = PartialDependenceDisplay.from_estimator(model3, X, [ 'SGOT', 'Copper', 'Platelets'],
pd_line_kw = {"color": "red"},
ice_lines_kw = {"color": "steelblue"},
kind = 'both',
target = 0,
response_method = 'predict_proba',
ax = ax3)
ax3.set_title('XGBoost Partial Dependency Plots')
plt.savefig('idk.png');